Configure all file paths based on the Rmarkdown parameter stage .

# things that never become expressed or bound
# grep -f L3.classD.wbid L1.classD.wbid  > L1L3.class.wbid
# grep -f L1L3.class.wbid LE.classD.wbid  > LEL1L3.classQ.wbid
classQ.wbid = read.table("../03_output/LEL1L3.classQ.wbid")[[1]]

rob.dir = normalizePath('~/work/ELT-2-ChIP-revision/Rob/03_emb_L1_L3_intestine_RNAseq/03_output')

rob.files = list(LE='res_embryoGFPplus_vs_embryoGFPminus_apeglm.csv',
                 L1='res_L1GFPplus_vs_L1GFPminus_apeglm.csv',
                 L3='res_L3GFPplus_vs_L3GFPminus_apeglm.csv')

RNASEQ = file.path(rob.dir, rob.files[[params$stage]])


UPSTREAM=1000
DOWNSTREAM=200
IDR_BED = sprintf("../01_input/ELT2_%s_combined_IDR.bed", params$stage) # peaks input file
OUTPUT_03 = normalizePath("../03_output") 

# output files from genomic ranges
PROMOTOR_BED_PATH = sprintf("%s/filtered.promoters.minus%d_plus%d.bed", 
                            OUTPUT_03, 
                            UPSTREAM, 
                            DOWNSTREAM)
# colliding promoters removed
NR_PROMOTOR_BED_PATH = sprintf("%s/nr.promoters.minus%d_plus%d.bed",
                               OUTPUT_03, 
                               UPSTREAM, 
                               DOWNSTREAM)

# input signal file for wiggle tool step
SIGNAL_BW = sprintf("../01_input/ELT2_%s_combined_subtracted.bw", params$stage)

INTERP_SIGNAL_BW = sprintf("../01_input/ELT2_%s_combined_subtracted.interp.bw", params$stage)

# output files from wiggle tool step
PROMOTOR_DF_PATH = sprintf("%s/%s.filtered.promoters.minus%d_plus%d.df", 
                           OUTPUT_03, 
                           params$stage,
                           UPSTREAM, 
                           DOWNSTREAM)

NR_PROMOTOR_DF_PATH = sprintf("%s/%s.nr.promoters.minus%d_plus%d.df", 
                           OUTPUT_03, 
                           params$stage,
                           UPSTREAM, 
                           DOWNSTREAM)

# Log conversion
LOG_PROMOTOR_DF_PATH = sprintf("%s/log.%s.filtered.promoters.minus%d_plus%d.df", 
                           OUTPUT_03, 
                           params$stage,
                           UPSTREAM, 
                           DOWNSTREAM)

IDR_DF = sprintf("../01_input/ELT2_%s_combined_IDR.df", params$stage) # peaks with signal agg

Promoters are upstream regions of all protein-coding genes

## Using saved promoter query.
## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames      ranges strand |   wbps_gene_id external_gene_id
##          <Rle>   <IRanges>  <Rle> |    <character>      <character>
##   [1]     chrI 10031-11230      - | WBGene00022277           homt-1
##   [2]     chrI 10495-11694      + | WBGene00022276           nlp-40
##   [3]     chrI 26582-27781      - | WBGene00022278           rcor-1
##   [4]     chrI 32951-34150      - | WBGene00022279           sesn-1
##   [5]     chrI 42733-43932      + | WBGene00022275            txt-7
##   [6]     chrI 46461-47660      + | WBGene00044345        Y48G1C.12
##   -------
##   seqinfo: 7 sequences (1 circular) from ce11 genome

Aggregate signal using wiggletools

Install a conda environment containing wiggletools and ucsc user apps via root/David/01_promoters/02_scripts/conda_envs/elt-2-rev.yaml

Sys.setenv(PROMOTOR_BED_PATH=PROMOTOR_BED_PATH, # all promoters
           NR_PROMOTOR_BED_PATH = NR_PROMOTOR_BED_PATH, # overlapping removed
           IDR_BED = IDR_BED, 
           IDR_DF = IDR_DF, 
           SIGNAL_BW = SIGNAL_BW,
           PROMOTOR_DF_PATH = PROMOTOR_DF_PATH,
           NR_PROMOTOR_DF_PATH = NR_PROMOTOR_DF_PATH,
           STAGE=params$stage
           )

Run wiggletools in a bash session.

Read-in aggregated signal data

Read in the results of the wiggletools commands.

promoters.agg = read.table(PROMOTOR_DF_PATH)
colnames(promoters.agg) <- c("chrom", "start","end","len", "strand", "wbps_gene_id", "gene_name", "chip_signal_mean", "chip_signal_max")

IDR_peaks.agg = read.table(IDR_DF)

IDR_peaks.agg$V4 = NULL
IDR_peaks.agg$V5 = NULL
IDR_peaks.agg$V6 = NULL
IDR_peaks.agg$V8 = NULL
colnames(IDR_peaks.agg) <- c("chrom", "start","end","intensity","nlogq","offset","signal.mean","signal.max")

gr.IDR = makeGRangesFromDataFrame(IDR_peaks.agg,keep.extra.columns = T)
seqinfo(gr.IDR) <- Seqinfo(genome="ce11")
gr.IDR$signal.sum = width(gr.IDR )*gr.IDR$signal.mean


gr.promoters = makeGRangesFromDataFrame(promoters.agg,keep.extra.columns = T)
seqinfo(gr.promoters) <- Seqinfo(genome="ce11")

Attach log scale promoter signal values.

chipmean.minval = min(gr.promoters$chip_signal_mean,na.rm=T)
chipmean.minval
## [1] -149
chipmax.minval = min(gr.promoters$chip_signal_max,na.rm=T)
chipmax.minval
## [1] -91.3
chipmean.log = log(-chipmean.minval + 1 + gr.promoters$chip_signal_mean,base=2)
chipmax.log = log(-chipmax.minval + 1 + gr.promoters$chip_signal_max,base=2)

gr.promoters$log_chip_signal_mean = chipmean.log
gr.promoters$log_chip_signal_max = chipmax.log
head(gr.promoters)
## GRanges object with 6 ranges and 7 metadata columns:
##       seqnames      ranges strand |       len   wbps_gene_id   gene_name
##          <Rle>   <IRanges>  <Rle> | <integer>    <character> <character>
##   [1]     chrI 10031-11230      - |      1200 WBGene00022277      homt-1
##   [2]     chrI 10495-11694      + |      1200 WBGene00022276      nlp-40
##   [3]     chrI 26582-27781      - |      1200 WBGene00022278      rcor-1
##   [4]     chrI 32951-34150      - |      1200 WBGene00022279      sesn-1
##   [5]     chrI 42733-43932      + |      1200 WBGene00022275       txt-7
##   [6]     chrI 46461-47660      + |      1200 WBGene00044345   Y48G1C.12
##       chip_signal_mean chip_signal_max log_chip_signal_mean log_chip_signal_max
##              <numeric>       <numeric>            <numeric>           <numeric>
##   [1]          97.7057        510.4876              7.95330             9.23555
##   [2]         313.8040        643.6636              8.85781             9.52353
##   [3]          51.6907        106.8574              7.65700             7.63790
##   [4]          28.0552         38.9742              7.47732             7.03664
##   [5]          32.2047         89.4033              7.51053             7.50559
##   [6]          17.3023         26.3194              7.38752             6.89042
##   -------
##   seqinfo: 7 sequences (1 circular) from ce11 genome
# output file
LOG_PROMOTOR_DF_PATH = sprintf("%s/log_%s_filtered.promoters.minus%d_plus%d.df", OUTPUT_03, params$stage, UPSTREAM, DOWNSTREAM)
write.table(as.data.frame(gr.promoters), file = LOG_PROMOTOR_DF_PATH,quote=F, row.names=F,sep="\t")

Find overlaps between promoters and IDR peaks. Populate IDR signal fields when a peak exists, leave NaN otherwise.

laps = findOverlaps(gr.promoters,gr.IDR, ignore.strand=T,minoverlap = 100)

sprintf("%d/%d (%.1f) promoters have a peak.", length(unique(from(laps))), laps@nLnode,
                                               length(unique(from(laps)))/ laps@nLnode)
## [1] "5043/19985 (0.3) promoters have a peak."
sprintf("%d/%d (%.1f) peaks are in promoters", length(unique(to(laps))), laps@nRnode,
                                               length(unique(to(laps)))/ laps@nRnode)
## [1] "4608/9198 (0.5) peaks are in promoters"
gr.promoters$IDR_mean = NaN
gr.promoters$IDR_max = NaN
gr.promoters$IDR_value = NaN
gr.promoters$nlogq = NaN
gr.promoters$IDR_sum = NaN
gr.promoters[from(laps)]$IDR_max = gr.IDR[to(laps)]$signal.max
gr.promoters[from(laps)]$IDR_mean = gr.IDR[to(laps)]$signal.mean
gr.promoters[from(laps)]$IDR_value = gr.IDR[to(laps)]$intensity
gr.promoters[from(laps)]$IDR_sum = gr.IDR[to(laps)]$signal.sum
gr.promoters[from(laps)]$nlogq = gr.IDR[to(laps)]$nlogq
print("Number of promoters overlapping an IDR peak:")
## [1] "Number of promoters overlapping an IDR peak:"
sum(!is.nan(gr.promoters$IDR_max))
## [1] 5043
# the density/distribution of the non-transformed values is too thin to
# show on the graph, so this is commented out. If you want to see the distribution, 
# uncomment the ecdf plots.
idr.nonlog = as.data.frame(gr.promoters) %>% filter(is.finite(IDR_value)) %>% dplyr::select('IDR_value','IDR_mean','IDR_max','IDR_sum') %>% gather(key="dataset") 

# ggplot(idr.nonlog, aes(x=value, fill=dataset))  + geom_density(alpha=.5) + labs(title="Distributions of NON-Log transformed ChIP signal values in peaks")
# 
# idr.val.ecdf = ecdf(gr.promoters$IDR_value)
# idr.mean.ecdf = ecdf(gr.promoters$IDR_mean)
# idr.max.ecdf = ecdf(gr.promoters$IDR_max)
# 
# plot(idr.val.ecdf)
# lines(idr.mean.ecdf,col="blue")
# lines(idr.max.ecdf,col="red")

# the data currently have all positive values, so no adjustment made for log
idr.val.log = log10(gr.promoters$IDR_value)
idr.mean.log = log10(gr.promoters$IDR_mean)
idr.max.log = log10(gr.promoters$IDR_max)
idr.sum.log = log10(gr.promoters$IDR_sum)

# idr.val.log.ecdf = ecdf(idr.val.log)
# idr.mean.log.ecdf = ecdf(idr.mean.log) 
# idr.max.log.ecdf = ecdf(idr.max.log)
# 
# plot(idr.val.log.ecdf)
# lines(idr.mean.log.ecdf,col="blue")
# lines(idr.max.log.ecdf,col="red")

log.vals = data.frame(idr.mean = idr.mean.log, idr.val = idr.val.log, idr.max = idr.max.log, idr.sum = idr.sum.log)

long.log.vals = gather(log.vals, key="dataset")
head(long.log.vals)
ggplot(long.log.vals, aes(x=value, fill=dataset))  + geom_density(alpha=.5) + labs(title=sprintf("Distributions of log10 transformed IDR values for %s",params$stage))
## Warning: Removed 59768 rows containing non-finite values (stat_density).

gr.promoters$IDR_logTEN_max = idr.max.log
gr.promoters$IDR_logTEN_mean = idr.mean.log
gr.promoters$IDR_logTEN_value = idr.val.log
gr.promoters$IDR_logTEN_sum = idr.sum.log



extremes = gr.promoters$IDR_logTEN_sum > 5
extremes[is.na(extremes)] <- FALSE
gr.extremes = gr.promoters[extremes]
gr.extremes = gr.extremes[order(gr.extremes$IDR_logTEN_sum, decreasing = T)]
head(gr.extremes)
## GRanges object with 6 ranges and 16 metadata columns:
##       seqnames            ranges strand |       len   wbps_gene_id   gene_name
##          <Rle>         <IRanges>  <Rle> | <integer>    <character> <character>
##   [1]    chrII   2377652-2378851      - |      1200 WBGene00017521    F16G10.6
##   [2]     chrV   8767909-8769108      + |      1200 WBGene00015220     B0507.3
##   [3]    chrIV   3899504-3900703      + |      1200 WBGene00023478      srz-62
##   [4]     chrX 14790008-14791207      + |      1200 WBGene00009628      tatn-1
##   [5]    chrII     915080-916279      + |      1200 WBGene00019923     fbxc-29
##   [6]    chrII   4925049-4926248      - |      1200 WBGene00019895      aagr-2
##       chip_signal_mean chip_signal_max log_chip_signal_mean log_chip_signal_max
##              <numeric>       <numeric>            <numeric>           <numeric>
##   [1]          5915.25        15444.01              12.5664             13.9234
##   [2]          2305.98         3950.32              11.2622             11.9811
##   [3]          1910.68         4123.10              11.0090             12.0415
##   [4]          1516.05         2514.83              10.7023             11.3483
##   [5]          1538.22         3469.06              10.7214             11.7982
##   [6]          1751.17         3384.64              10.8928             11.7636
##        IDR_mean   IDR_max IDR_value     nlogq   IDR_sum IDR_logTEN_max
##       <numeric> <numeric> <numeric> <numeric> <numeric>      <numeric>
##   [1]   9558.01  15444.01   9719.27   3.12927   9529332        4.18876
##   [2]   3186.10   3950.32   2912.77   3.12927   1965824        3.59663
##   [3]   3184.86   4123.10   3796.41   3.12927   1859957        3.61522
##   [4]   1906.45   2514.83   1319.22   3.12927   1593795        3.40051
##   [5]   2869.27   3469.06   3165.58   3.12927   1526452        3.54021
##   [6]   2713.72   3384.64   2952.32   3.12927   1476266        3.52951
##       IDR_logTEN_mean IDR_logTEN_value IDR_logTEN_sum
##             <numeric>        <numeric>      <numeric>
##   [1]         3.98037          3.98763        6.97906
##   [2]         3.50326          3.46431        6.29354
##   [3]         3.50309          3.57937        6.26950
##   [4]         3.28023          3.12032        6.20243
##   [5]         3.45777          3.50045        6.18368
##   [6]         3.43357          3.47016        6.16916
##   -------
##   seqinfo: 7 sequences (1 circular) from ce11 genome

Read in RNA-seq data, join promoters by wbps geneid, and then sort logFoldChange high to low.

# input file
rnaseq = read.csv(RNASEQ)
rownames(rnaseq) <- rnaseq$WBGeneID

mcols(gr.promoters) <- mcols(gr.promoters) %>% 
  cbind(rnaseq[gr.promoters$wbps_gene_id,2:6])  %>% 
  as.data.frame() %>% 
  dplyr::rename(IDR_nlogq = nlogq)

names(gr.promoters) <- gr.promoters$wbps_gene_id

# sort promoters high to low by log2FC
gr.promoters = gr.promoters[order(gr.promoters$log2FoldChange,decreasing=T)]

gr.promoters[,2:5]
## GRanges object with 19985 ranges and 4 metadata columns:
##                  seqnames            ranges strand |   wbps_gene_id   gene_name
##                     <Rle>         <IRanges>  <Rle> |    <character> <character>
##   WBGene00016231     chrV   2579898-2581097      + | WBGene00016231     C29G2.3
##   WBGene00021593   chrIII   1754743-1755942      - | WBGene00021593   Y46E12A.3
##   WBGene00005522     chrV 18806310-18807509      - | WBGene00005522      sri-10
##   WBGene00020384     chrV   5349372-5350571      - | WBGene00020384     T09D3.3
##   WBGene00005548    chrII   2585394-2586593      - | WBGene00005548      sri-36
##              ...      ...               ...    ... .            ...         ...
##   WBGene00304895     chrX 16952760-16953959      - | WBGene00304895     F52G3.8
##   WBGene00303060     chrX 16962921-16964120      + | WBGene00303060     F52G3.7
##   WBGene00016901     chrX 17309908-17311107      + | WBGene00016901    C53C11.4
##   WBGene00303423     chrX 17361829-17363028      + | WBGene00303423    F10D7.11
##   WBGene00304237     chrX 17567279-17568478      - | WBGene00304237     F39B3.7
##                  chip_signal_mean chip_signal_max
##                         <numeric>       <numeric>
##   WBGene00016231         -25.3792       -14.51178
##   WBGene00021593         -21.1586       -17.62152
##   WBGene00005522         -13.0670        -3.70544
##   WBGene00020384         -29.8736       -18.35473
##   WBGene00005548          58.7131       103.57881
##              ...              ...             ...
##   WBGene00304895        -25.47366      -13.938541
##   WBGene00303060        100.15240      225.856720
##   WBGene00016901         -4.03903        0.056697
##   WBGene00303423        -16.65387       -2.584754
##   WBGene00304237        -18.73224       -7.003493
##   -------
##   seqinfo: 7 sequences (1 circular) from ce11 genome
# look at the number filtered by DESeq2
# as described by https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pvaluesNA
baseMean_is_zero = rnaseq$baseMean == 0
pval_na = is.na(rnaseq$pvalue)
padj_na = is.na(rnaseq$padj)
# case one
sum(baseMean_is_zero & pval_na & padj_na)
## [1] 0
# case two
sum(!baseMean_is_zero & pval_na & padj_na)
## [1] 38
# case three
sum(!pval_na & padj_na)
## [1] 7713
# divide groups by peak and padj
enriched_intestine = gr.promoters$padj<.05 & !is.na(gr.promoters$padj) & gr.promoters$log2FoldChange > 0
has_peak = !is.nan(gr.promoters$IDR_max)
classA = enriched_intestine & has_peak
classB = !enriched_intestine & has_peak
classC = enriched_intestine & !has_peak
classD = !enriched_intestine & !has_peak

m = matrix( c(sum(classA),
            sum(classB),
            sum(classC),
            sum(classD)), ncol = 2, byrow = T)
colnames(m) <- c("Peak","NO peak")
rownames(m) <- c("Int. expressed","NOT Int. expressed")
m
##                    Peak NO peak
## Int. expressed     1767    3276
## NOT Int. expressed 1170   13772
m.chisq = chisq.test(m)
m.chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  m
## X-squared = 2224, df = 1, p-value <2e-16
display_table = rbind(m,m.chisq$expected)
pval.label=ifelse(m.chisq$p.value <2e-16, "<2e-16", sprintf("%.3f", .chisq$p.value))
knitr::kable(display_table, 
             digits = 1,
             format.args=list(big.mark = ','),
             caption=sprintf("%s: Chi-sq p-val: %s",
                                params$stage, 
                                pval.label)) %>% 
      pack_rows(index = c("Observed" = 2, "Expected" = 2)) %>%
      kable_styling(latex_options = "scale_down")
L3: Chi-sq p-val: <2e-16
Peak NO peak
Observed
Int. expressed 1,767 3,276
NOT Int. expressed 1,170 13,772
Expected
Int. expressed 741 4,302
NOT Int. expressed 2,196 12,746
gr.promoters$class = "classA"
gr.promoters$class[classB] <- "classB"
gr.promoters$class[classC] <- "classC"
gr.promoters$class[classD] <- "classD"
sigmoid = function(z) {1 / (1 + exp(-z))}

dataset = as.data.frame(mcols(gr.promoters))
dataset$enriched = ifelse( dataset$padj < .05, 1, 0)
dataset = dataset %>% dplyr::filter(is.finite(enriched)) %>% arrange(log_chip_signal_mean)

#dataset = dataset[11:(nrow(dataset)-10),]


mod = glm(enriched ~ log_chip_signal_max, data=dataset[11:(nrow(dataset)-10),], family = "binomial")
dataset$prediction = sigmoid(predict(mod,newdata=dataset))

curve = data.frame(
  log_chip_signal_max=seq(
    min(dataset$log_chip_signal_max),
    max(dataset$log_chip_signal_max),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))


dataset=dataset %>% arrange(log_chip_signal_max)
ggplot(dataset[11:(nrow(dataset)-10),], aes(x=log_chip_signal_max, y=enriched)) + 
  geom_jitter(alpha=.1) + 
  geom_line(data=curve, aes(x=log_chip_signal_max, y=y),inherit.aes=F) +
  labs(title=sprintf("%s: Does promoter signal max predict intestine expression?",params$stage),
       y="pr(intestine enriched)")

mod = glm(enriched ~ log_chip_signal_mean, data=dataset[11:(nrow(dataset)-10),], family = "binomial")
dataset$prediction = sigmoid(predict(mod,newdata=dataset))

curve = data.frame(
  log_chip_signal_mean=seq(
    min(dataset$log_chip_signal_mean),
    max(dataset$log_chip_signal_mean),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))

dataset=dataset %>% arrange(log_chip_signal_mean)
ggplot(dataset[11:(nrow(dataset)-10),], aes(x=log_chip_signal_mean, y=enriched)) + 
  geom_point(alpha=.1) + 
  geom_line(data=curve, aes(x=log_chip_signal_mean, y=y),inherit.aes=F) +
  labs(title=sprintf("%s: Does promoter signal mean/sum predict intestine expression?",params$stage),y="pr(intestine enriched)") 

ROC = function(scores, truth) {
  nPos = sum(truth)
  nNeg = sum(!truth)
  
  # True negatives
  TN = function(i) {
    ecdf(scores[!truth])(i) * nNeg
  }
  # False negatives
  FN = function(i) {
    ecdf(scores[truth])(i) * nPos
  }
  # False positives
  FP = function(i) {
    nNeg - TN(i)
  }
  # True positives
  TP = function(i) {
    nPos - FN(i)
  }
  FPR = function(i) {
    FP(i)/(FP(i) + TN(i))
  }
  # Sensitivity
  Sn = function(i) {
    TP(i)/(TP(i) + FN(i)) # efficiency of recovering True Positives
  }
  Sp = function(i) {
    TN(i)/(TN(i) + FP(i)) # success at excluding set B (True Negatives)
  }
  # Positive predictive value
  PPV = function(i) {
    TP(i)/(TP(i) + FP(i)) # proportion of called positives that are correct
  }
  # False positive rate
  FPR = function(i) {
    FP(i)/(FP(i)+TN(i)) # failure to exclude set B (True Negatives)
  }
  list(Sp=Sp,Sn=Sn,TP=TP,FP=FP,TN=TN,FN=FN,PPV=PPV,TPR=Sn,FPR=FPR)
}

dataset = as.data.frame(mcols(gr.promoters[classA|classB]) )
dataset = dataset[order(dataset$IDR_logTEN_sum),]
dataset$class = factor(dataset$class, levels=c("classA","classB"))
thresholds = seq(min(dataset$IDR_logTEN_sum), max(dataset$IDR_logTEN_sum), length.out = 100)


ROC.IDR.sum = ROC(dataset$IDR_logTEN_sum, dataset$class == 'classA')
TPR=ROC.IDR.sum$TPR
FPR=ROC.IDR.sum$FPR
TP = ROC.IDR.sum$TP
FP = ROC.IDR.sum$FP

TPRvsFPR = TPR(thresholds) - FPR(thresholds)
best=max(TPRvsFPR)
best_threshold = thresholds[ which(TPRvsFPR == best)]

ggplot(data.frame(threshold=thresholds,TPRvsFPR=TPRvsFPR), 
       aes(x=threshold,y=TPRvsFPR)) + 
  geom_line() +
  geom_vline(xintercept=best_threshold) +
  ggtitle(sprintf("%s: Using IDR sum score to recover classA (TPR) \nand exclude classB (FPR) ", params$stage)) + ylab("TPR - FPR")

df = data.frame()
for (funcname in names(ROC.IDR.sum)) {
  f = ROC.IDR.sum[[funcname]]
  df = df %>% rbind(  data.frame(threshold=thresholds,measure=funcname,value=f(thresholds)))
}
df2 = df %>% filter(measure %in% c("TPR","FPR"))
df2$measure = factor(df2$measure, levels=c("TPR","FPR"))             
ggplot(df2, aes(x=threshold,y=value, color=measure)) + geom_line() +
  scale_color_manual(values=c("black","red")) +
  geom_segment(x=best_threshold,xend=best_threshold,
               y=FPR(best_threshold), yend=TPR(best_threshold),color='black') +
  geom_point(x=best_threshold, y=TPR(best_threshold), color="black", size=.75) +
  geom_point(x=best_threshold, y=FPR(best_threshold), color="black", size=.75) +
  ggtitle("Positive predictive value of using peak signal to predict binding") +
  geom_text(inherit.aes=F,aes(x=best_threshold, y=TPR(best_threshold)), 
            position=position_nudge(x=.6), label=sprintf("recover: %d/%d classA\nexclude: %d/%d classB", TP(best_threshold),sum(dataset$class == 'classA'), FP(best_threshold),sum(dataset$class == 'classB')))

data.frame(IDR_sum=thresholds, TPR=ROC.IDR.sum$TPR(thresholds), FPR=ROC.IDR.sum$FPR(thresholds)) %>%
  ggplot(aes(x=FPR,y=TPR)) + geom_line() +
  geom_point(x=FPR(best_threshold),y=TPR(best_threshold), size=.75) +
  geom_abline(slope=1) + ggtitle("ROC for IDR sum")

ROC.df = data.frame(TPR=ROC.IDR.sum$TPR(thresholds),FPR=ROC.IDR.sum$FPR(thresholds),score="IDR sum")
IDR_SUM_BEST = which(thresholds == best_threshold)
thresholds = seq(min(dataset$IDR_logTEN_max), max(dataset$IDR_logTEN_max), length.out = 100)


ROC.IDR.max = ROC(dataset$IDR_logTEN_max, dataset$class == 'classA')
TPR=ROC.IDR.max$TPR
FPR=ROC.IDR.max$FPR
TP = ROC.IDR.max$TP
FP = ROC.IDR.max$FP

TPRvsFPR = TPR(thresholds) - FPR(thresholds)
best=max(TPRvsFPR)
best_threshold = thresholds[ which(TPRvsFPR == best)]

ggplot(data.frame(threshold=thresholds,TPRvsFPR=TPRvsFPR), 
       aes(x=threshold,y=TPRvsFPR)) + 
  geom_line() +
  geom_vline(xintercept=best_threshold) +
  ggtitle(sprintf("%s: Using IDR max score to recover classA (TPR) \nand exclude classB (FPR) ", params$stage)) + ylab("TPR - FPR")

df = data.frame()
for (funcname in names(ROC.IDR.max)) {
  f = ROC.IDR.max[[funcname]]
  df = df %>% rbind(  data.frame(threshold=thresholds,measure=funcname,value=f(thresholds)))
}
df2 = df %>% filter(measure %in% c("TPR","FPR"))
df2$measure = factor(df2$measure, levels=c("TPR","FPR"))             
ggplot(df2, aes(x=threshold,y=value, color=measure)) + geom_line() +
  scale_color_manual(values=c("black","red")) +
  geom_segment(x=best_threshold,xend=best_threshold,
               y=FPR(best_threshold), yend=TPR(best_threshold),color='black') +
  geom_point(x=best_threshold, y=TPR(best_threshold), color="black", size=.75) +
  geom_point(x=best_threshold, y=FPR(best_threshold), color="black", size=.75) +
  ggtitle("Positive predictive value of using peak signal to predict binding") +
  geom_text(inherit.aes=F,aes(x=best_threshold, y=TPR(best_threshold)), 
            position=position_nudge(x=.6), label=sprintf("recover: %d/%d classA\nexclude: %d/%d classB", 
                   round(TP(best_threshold)),
                   sum(dataset$class == 'classA'), 
                   round(FP(best_threshold)),
                   sum(dataset$class == 'classB')))

data.frame(IDR_max=thresholds, TPR=ROC.IDR.max$TPR(thresholds), FPR=ROC.IDR.max$FPR(thresholds)) %>%
  ggplot(aes(x=FPR,y=TPR)) + geom_line() +
  geom_point(x=FPR(best_threshold),y=TPR(best_threshold), size=.75) +
  geom_abline(slope=1)

ROC.df = ROC.df %>% 
  rbind(data.frame(TPR=ROC.IDR.max$TPR(thresholds),FPR=ROC.IDR.max$FPR(thresholds),score="IDR max"))
IDR_MAX_BEST = which(thresholds == best_threshold)

ggplot(ROC.df, aes(x=FPR,y=TPR, color=score)) + geom_line() + geom_abline(slope=1)

thresholds = seq(min(dataset$IDR_logTEN_value), max(dataset$IDR_logTEN_value), length.out = 100)


ROC.IDR.value = ROC(dataset$IDR_logTEN_value, dataset$class == 'classA')
TPR=ROC.IDR.value$TPR
FPR=ROC.IDR.value$FPR
TP = ROC.IDR.value$TP
FP = ROC.IDR.value$FP

TPRvsFPR = TPR(thresholds) - FPR(thresholds)
best=max(TPRvsFPR)
best_threshold = thresholds[ which(TPRvsFPR == best)]

ROC.df = ROC.df %>% 
  rbind(data.frame(TPR=ROC.IDR.value$TPR(thresholds),FPR=ROC.IDR.value$FPR(thresholds),score="IDR value"))
IDR_VALUE_BEST = which(thresholds == best_threshold)

ggplot(ROC.df, aes(x=FPR,y=TPR, color=score)) + geom_line() + geom_abline(slope=1) + ggtitle(sprintf("%s: ROC predictive power of IDR score to predict intestine expression", params$stage))

thresholds = seq(min(dataset$IDR_logTEN_mean), max(dataset$IDR_logTEN_mean), length.out = 100)


ROC.IDR.mean = ROC(dataset$IDR_logTEN_mean, dataset$class == 'classA')
TPR=ROC.IDR.mean$TPR
FPR=ROC.IDR.mean$FPR
TP = ROC.IDR.mean$TP
FP = ROC.IDR.mean$FP

TPRvsFPR = TPR(thresholds) - FPR(thresholds)
best=max(TPRvsFPR)
best_threshold = thresholds[ which(TPRvsFPR == best)]

ROC.df = ROC.df %>% 
  rbind(data.frame(TPR=ROC.IDR.mean$TPR(thresholds),FPR=ROC.IDR.mean$FPR(thresholds),score="IDR mean"))
IDR_MEAN_best = which(thresholds == best_threshold)

ggplot(ROC.df, aes(x=FPR,y=TPR, color=score)) + geom_line() + geom_abline(slope=1) + ggtitle(sprintf("%s ROC: predictive power of IDR score to predict intestine expression", params$stage))

kable(ROC.df %>% group_by(score) %>% summarize(best = max(TPR-FPR)), caption=sprintf("%s best TPR-FPR", params$stage), digits=3)
L3 best TPR-FPR
score best
IDR max 0.289
IDR mean 0.294
IDR sum 0.242
IDR value 0.252
library(MASS)

# IDR sum in peak
classAB.lda = lda(as.matrix(dataset$IDR_logTEN_sum), dataset$class)
dataset$IDR_logTEN_sum.ghat = predict(classAB.lda)$class
mean(dataset$IDR_logTEN_sum.ghat != dataset$class)
## [1] 0.336
ggplot(dataset, aes(x=IDR_logTEN_sum, fill=class)) + geom_density(alpha=.5)

ggplot(dataset, aes(x=IDR_logTEN_mean, fill=class)) + geom_density(alpha=.5)

classAB.lda = lda(as.matrix(dataset$IDR_logTEN_max), dataset$class)
dataset$IDR_logTEN_max.ghat = predict(classAB.lda)$class

mean(dataset$IDR_logTEN_max.ghat != dataset$class)
## [1] 0.332
changepoint = dataset %>% filter(IDR_logTEN_max.ghat == 'classB') %>% pull(IDR_logTEN_max) %>% max()

total_greater_changepoint = dataset %>% 
  filter(IDR_logTEN_max > changepoint) %>% 
  nrow()
frac = dataset %>% filter(IDR_logTEN_max > changepoint) %>% 
  group_by(class) %>% 
  summarize(count=n(), frac=n()/total_greater_changepoint) %>% 
  filter(class == "classA") %>% pull(frac)

ggplot(dataset, aes(x=IDR_logTEN_max, fill=class)) + 
  geom_density(alpha=.5) + 
  geom_vline(xintercept=changepoint) + 
  scale_fill_brewer(palette="Spectral")

dataset=dataset %>% arrange(IDR_logTEN_max)
isClassA = dataset$class == "classA"
incrClassA = cumsum(isClassA)
incrClassB = cumsum(!isClassA)
dataset$incrClassA = incrClassA
dataset$incrClassB = incrClassB
dataset$fClassA = incrClassA/(1:nrow(dataset))
ggplot(dataset, aes(x=IDR_logTEN_max, y=incrClassA)) + geom_line() + geom_line(aes(y=incrClassB),color='orange')

thresholds = seq(min(dataset$IDR_logTEN_max), max(dataset$IDR_logTEN_max), length.out=24)
f.all = function(x) { 1-ecdf(dataset$IDR_logTEN_max)(x) } 
called_positives = f.all(thresholds)*nrow(dataset)
f.classA = function(x) {1-ecdf(dataset$IDR_logTEN_max[dataset$class == 'classA'])(x)}
actual_positives = f.classA(thresholds)*sum(dataset$class == 'classA')
dataset$enriched = ifelse( dataset$padj < .05, 1, 0)

dataset = dataset %>% filter(is.finite(enriched)) %>% arrange(IDR_logTEN_sum)

mod = glm(enriched ~ IDR_logTEN_sum, data=dataset[11:(nrow(dataset)-10),], family = "binomial")

curve = data.frame(
  IDR_logTEN_sum=seq(
    min(dataset$IDR_logTEN_sum),
    max(dataset$IDR_logTEN_sum),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))

dataset$prediction = sigmoid(predict(mod, newdata=dataset))
dataset=dataset %>% arrange(IDR_logTEN_sum)
ggplot(dataset, aes(x=IDR_logTEN_sum, y=enriched)) + 
  geom_point(alpha=.3) + 
  geom_line(data=curve, aes(x=IDR_logTEN_sum, y=y),inherit.aes=F) +  labs(title=sprintf("%s: Does peak signal sum predict intestine expression?",params$stage), y="pr(intestine enriched)")

mod = glm(enriched ~ IDR_logTEN_mean, data=dataset[11:(nrow(dataset)-10),], family = "binomial")

curve = data.frame(
  IDR_logTEN_mean=seq(
    min(dataset$IDR_logTEN_mean),
    max(dataset$IDR_logTEN_mean),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))

dataset$prediction = sigmoid(predict(mod,newdata=dataset))
dataset=dataset %>% arrange(IDR_logTEN_mean)
ggplot(dataset, aes(x=IDR_logTEN_mean, y=enriched)) + 
  geom_point(alpha=.3) + 
  geom_line(data=curve, aes(x=IDR_logTEN_mean, y=y),inherit.aes=F) + 
  labs(title=sprintf("%s: Does peak signal mean predict intestine expression?",params$stage),y="pr(intestine enriched)")

mod = glm(enriched ~ IDR_logTEN_max, data=dataset, family = "binomial")

curve = data.frame(
  IDR_logTEN_max=seq(
    min(dataset$IDR_logTEN_max),
    max(dataset$IDR_logTEN_max),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))

dataset$prediction = sigmoid(predict(mod))
dataset=dataset %>% arrange(IDR_logTEN_max)
ggplot(dataset, aes(x=IDR_logTEN_max, y=enriched)) + 
  geom_point(alpha=.3) + 
    geom_line(data=curve, aes(x=IDR_logTEN_max, y=y),inherit.aes=F) + 
  labs(title=sprintf("%s: Does peak signal max predict intestine expression?",params$stage),y="pr(intestine enriched)")

mod = glm(enriched ~ IDR_logTEN_value, data=dataset, family = "binomial")

curve = data.frame(
  IDR_logTEN_value=seq(
    min(dataset$IDR_logTEN_value),
    max(dataset$IDR_logTEN_value),
    length.out=50))
curve$y = sigmoid(predict(mod, newdata=curve))

dataset$prediction = sigmoid(predict(mod))
dataset=dataset %>% arrange(IDR_logTEN_value)
ggplot(dataset, aes(x=IDR_logTEN_value, y=enriched)) + 
  geom_point(alpha=.3) + 
      geom_line(data=curve, aes(x=IDR_logTEN_value, y=y),inherit.aes=F) +
  labs(title=sprintf("%s: Does peak IDR value predict intestine expression?",params$stage), y="pr(intestine enriched)")

promoters.hilo = as.data.frame(gr.promoters)

PROMOTERS_HILO_BED_PATH = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.bed", params$stage))
PROMOTERS_HILO_TSV_PATH = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.tsv", params$stage))
PROMOTERS_HILO_BED_PATH_A = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classA.bed", params$stage))
PROMOTERS_HILO_BED_PATH_B = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classB.bed", params$stage))
PROMOTERS_HILO_BED_PATH_C = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classC.bed", params$stage))
PROMOTERS_HILO_BED_PATH_D = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classD.bed", params$stage))


# BED format
write.table(promoters.hilo, PROMOTERS_HILO_BED_PATH, quote=F, sep="\t", row.names=F, col.names=F)

# Matrix format readable into R
write.table(promoters.hilo, PROMOTERS_HILO_TSV_PATH, quote=F, sep="\t", row.names=T, col.names=T)

write.table(promoters.hilo[classA,], 
            PROMOTERS_HILO_BED_PATH_A, quote=F, sep="\t", row.names=F, col.names=F)
write.table(promoters.hilo[classB,], 
            PROMOTERS_HILO_BED_PATH_B, quote=F, sep="\t", 
row.names=F, col.names=F)
write.table(promoters.hilo[classC,], 
            PROMOTERS_HILO_BED_PATH_C, quote=F, sep="\t", 
row.names=F, col.names=F)
write.table(promoters.hilo[classD,], 
            PROMOTERS_HILO_BED_PATH_D, quote=F, sep="\t", 
row.names=F, col.names=F)

#### deeptooling up versus down only, no other filters
promoters.hilo.up = promoters.hilo %>% dplyr::filter(log2FoldChange > 0)
promoters.hilo.down = promoters.hilo %>% dplyr::filter(log2FoldChange < 0)

PROMOTERS_HILO_BED_PATH_UP = file.path(OUTPUT_03, 
                                       sprintf("%s.promoters.hilo.up.bed",params$stage))
PROMOTERS_HILO_BED_PATH_DOWN = file.path(OUTPUT_03, 
                                         sprintf("%s.promoters.hilo.down.bed",params$stage))

write.table(promoters.hilo.up, 
            PROMOTERS_HILO_BED_PATH_UP, 
            quote=F, 
            sep="\t", 
row.names=F, col.names=F)

write.table(promoters.hilo.down, 
            PROMOTERS_HILO_BED_PATH_DOWN, 
            quote=F, 
            sep="\t", 
row.names=F, col.names=F)
wtf.wbid = read.table('../01_input/wtf3.wbid')[[1]]
in_WTF = gr.promoters$wbps_gene_id %in% wtf.wbid 

PROMOTERS_HILO_BED_PATH_A_TF = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classA.TF.bed", params$stage))
PROMOTERS_HILO_BED_PATH_B_TF = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classB.TF.bed", params$stage))
PROMOTERS_HILO_BED_PATH_C_TF = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classC.TF.bed", params$stage))
PROMOTERS_HILO_BED_PATH_D_TF = file.path(OUTPUT_03, sprintf("%s.promoters.hilo.classD.TF.bed", params$stage))

write.table(promoters.hilo[classA&in_WTF,], 
            PROMOTERS_HILO_BED_PATH_A_TF, quote=F, sep="\t", row.names=F, col.names=F)
write.table(promoters.hilo[classB&in_WTF,], 
            PROMOTERS_HILO_BED_PATH_B_TF, quote=F, sep="\t", 
row.names=F, col.names=F)
write.table(promoters.hilo[classC&in_WTF,], 
            PROMOTERS_HILO_BED_PATH_C_TF, quote=F, sep="\t", 
row.names=F, col.names=F)
write.table(promoters.hilo[classD&in_WTF,], 
            PROMOTERS_HILO_BED_PATH_D_TF, quote=F, sep="\t", 
row.names=F, col.names=F)

To produce the deeptools output, execute DEEPTOOLS.bash.

It will compute promoters.hilo.mx and promoters.hilo.pdf.

Deeptools PDFs indicate a font called dejavu, if you’re tired of replacing it in Illustrator, install it from: https://sourceforge.net/projects/dejavu/

# data and params
Sys.setenv(UPSTREAM=UPSTREAM,
           DOWNSTREAM=DOWNSTREAM,
           INTERP_SIGNAL_BW=INTERP_SIGNAL_BW,
            PROMOTERS_HILO_BED_PATH=PROMOTERS_HILO_BED_PATH,
            PROMOTERS_HILO_BED_PATH_A=PROMOTERS_HILO_BED_PATH_A,
            PROMOTERS_HILO_BED_PATH_B=PROMOTERS_HILO_BED_PATH_B,
            PROMOTERS_HILO_BED_PATH_C=PROMOTERS_HILO_BED_PATH_C,
            PROMOTERS_HILO_BED_PATH_D=PROMOTERS_HILO_BED_PATH_D,
            PROMOTERS_HILO_BED_PATH_A_TF=PROMOTERS_HILO_BED_PATH_A_TF,
            PROMOTERS_HILO_BED_PATH_B_TF=PROMOTERS_HILO_BED_PATH_B_TF,
            PROMOTERS_HILO_BED_PATH_C_TF=PROMOTERS_HILO_BED_PATH_C_TF,
            PROMOTERS_HILO_BED_PATH_D_TF=PROMOTERS_HILO_BED_PATH_D_TF)

# output files
PROMOTERS_HILO_UPDOWN_MX = file.path(OUTPUT_03,
                    sprintf("%s.promoters.hilo.updown.mx",params$stage))
PROMOTERS_CLASSES_UPDOWN_MX = file.path(OUTPUT_03,
                    sprintf("%s.promoters.hilo.CLASSES.mx",params$stage))
PROMOTERS_TF_CLASSES_UPDOWN_MX = file.path(OUTPUT_03,
                    sprintf("%s.promoters.hilo.TF.CLASSES.mx",params$stage))
PROMOTERS_HILO_UPDOWN_PDF = file.path(OUTPUT_03,
                    sprintf("%s.promoters.hilo.updown.pdf",params$stage))

PROMOTERS_CLASSES_UPDOWN_PDF = file.path(OUTPUT_03,                                                       sprintf("%s.promoters.hilo.CLASSES.pdf",params$stage))
PROMOTERS_TF_CLASSES_UPDOWN_PDF = file.path(OUTPUT_03,                                                       sprintf("%s.promoters.hilo.TF.CLASSES.pdf",params$stage))

Sys.setenv(PROMOTERS_HILO_BED_PATH_UP=PROMOTERS_HILO_BED_PATH_UP,
            PROMOTERS_HILO_BED_PATH_DOWN=PROMOTERS_HILO_BED_PATH_DOWN,
            PROMOTERS_HILO_UPDOWN_MX=PROMOTERS_HILO_UPDOWN_MX,
            PROMOTERS_CLASSES_UPDOWN_MX=PROMOTERS_CLASSES_UPDOWN_MX,
            PROMOTERS_HILO_UPDOWN_PDF=PROMOTERS_HILO_UPDOWN_PDF,
            PROMOTERS_CLASSES_UPDOWN_PDF=PROMOTERS_CLASSES_UPDOWN_PDF,
           PROMOTERS_TF_CLASSES_UPDOWN_MX=PROMOTERS_TF_CLASSES_UPDOWN_MX,
           PROMOTERS_TF_CLASSES_UPDOWN_PDF=PROMOTERS_TF_CLASSES_UPDOWN_PDF
           )
source $HOME/.bash_profile
conda activate derptools # yaml environ in 02_scripts/conda_envs
pwd

set -ue # exit 1 if any vars are not set (using Sys.setenv in prev chunks)
echo PROMOTERS_HILO_BED_PATH $PROMOTERS_HILO_BED_PATH
echo PROMOTERS_HILO_BED_PATH_A_TF $PROMOTERS_HILO_BED_PATH_A_TF
echo PROMOTERS_HILO_BED_PATH_B_TF $PROMOTERS_HILO_BED_PATH_B_TF
echo PROMOTERS_HILO_BED_PATH_C_TF $PROMOTERS_HILO_BED_PATH_C_TF
echo PROMOTERS_HILO_BED_PATH_D_TF $PROMOTERS_HILO_BED_PATH_D_TF
echo PROMOTERS_TF_CLASSES_UPDOWN_MX $PROMOTERS_TF_CLASSES_UPDOWN_MX
echo PROMOTERS_TF_CLASSES_UPDOWN_PDF $PROMOTERS_TF_CLASSES_UPDOWN_PDF
BODYLENGTH=$(($UPSTREAM + $DOWNSTREAM))

# real  1m59.354s
# user  3m47.980s
# sys   0m2.663s
if true
then
time computeMatrix scale-regions --regionBodyLength $BODYLENGTH \
                                --startLabel 'up-1Kb' \
                                --endLabel down+200 \
                                --beforeRegionStartLength $UPSTREAM\
                                --afterRegionStartLength $DOWNSTREAM\
                                -R $PROMOTERS_HILO_BED_PATH_A_TF $PROMOTERS_HILO_BED_PATH_B_TF $PROMOTERS_HILO_BED_PATH_C_TF $PROMOTERS_HILO_BED_PATH_D_TF\
                                -S $INTERP_SIGNAL_BW\
                                -p 4 -o $PROMOTERS_TF_CLASSES_UPDOWN_MX
fi
plotHeatmap --plotTitle "WTF class A-D regions sorted by logFC expression high to low for ${STAGE}"\
            --matrixFile $PROMOTERS_TF_CLASSES_UPDOWN_MX\
             -out $PROMOTERS_TF_CLASSES_UPDOWN_PDF\
             --sortRegions no\
             --colorMap RdYlBu_r\
             --startLabel '' --endLabel ''\
             --regionsLabel 'peak+int. enrich.' 'peak+ NOT int. enrich.' 'NO peak + int. enrich.' 'NO peak + NOT int. enrich.'\
             --samplesLabel 'ELT-2 signal (reps. combined subtracted)'\
gr.promoters.classA = gr.promoters[classA]

# scatter plot with linear mods on logFC up and down separately
gr.promoters.classA %>% as.data.frame() %>% 
  ggplot(
    aes(x=log_chip_signal_max, 
        y=log2FoldChange,
        group=log2FoldChange>0)) + geom_point() +
        geom_smooth(method='lm', formula= y~x) +
        ggtitle("Peak + Intestine Enriched") 

classA.up = promoters.hilo %>% as.data.frame() %>% dplyr::filter(classA & log2FoldChange > 0)
up.table = classA.up[,c('log2FoldChange',
                      'chip_signal_mean',
                      'chip_signal_max',
                      'log_chip_signal_mean', 
                      'log_chip_signal_max',  
                      'IDR_mean',   'IDR_max', 'IDR_value')]

cor.up.table = cor(up.table)
options(digits=3)
knitr::kable(cor.up.table, caption="Pairwise correlations")
Pairwise correlations
log2FoldChange chip_signal_mean chip_signal_max log_chip_signal_mean log_chip_signal_max IDR_mean IDR_max IDR_value
log2FoldChange 1.000 0.128 0.137 0.141 0.154 0.151 0.144 0.161
chip_signal_mean 0.128 1.000 0.956 0.954 0.894 0.919 0.922 0.794
chip_signal_max 0.137 0.956 1.000 0.918 0.929 0.960 0.965 0.877
log_chip_signal_mean 0.141 0.954 0.918 1.000 0.959 0.902 0.887 0.756
log_chip_signal_max 0.154 0.894 0.929 0.959 1.000 0.920 0.902 0.798
IDR_mean 0.151 0.919 0.960 0.902 0.920 1.000 0.994 0.924
IDR_max 0.144 0.922 0.965 0.887 0.902 0.994 1.000 0.919
IDR_value 0.161 0.794 0.877 0.756 0.798 0.924 0.919 1.000
cor.test(classA.up[,'log2FoldChange'],classA.up[,'IDR_mean'])
## 
##  Pearson's product-moment correlation
## 
## data:  classA.up[, "log2FoldChange"] and classA.up[, "IDR_mean"]
## t = 6, df = 1765, p-value = 2e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.105 0.196
## sample estimates:
##   cor 
## 0.151
cor.test(classA.up[,'log2FoldChange'],classA.up[,'log_chip_signal_mean'])
## 
##  Pearson's product-moment correlation
## 
## data:  classA.up[, "log2FoldChange"] and classA.up[, "log_chip_signal_mean"]
## t = 6, df = 1765, p-value = 3e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0949 0.1863
## sample estimates:
##   cor 
## 0.141
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Monterey 12.2.1
## 
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] MASS_7.3-55            dplyr_1.0.8            kableExtra_1.3.4      
##  [4] tidyr_1.2.0            ggplot2_3.3.5          GenomicRanges_1.46.1  
##  [7] GenomeInfoDb_1.30.1    IRanges_2.28.0         S4Vectors_0.32.3      
## [10] BiocGenerics_0.40.0    biomaRt_2.50.3         ParasiteXML_0.0.0.9000
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155           bitops_1.0-7           bit64_4.0.5           
##  [4] filelock_1.0.2         webshot_0.5.2          RColorBrewer_1.1-2    
##  [7] progress_1.2.2         httr_1.4.2             tools_4.1.2           
## [10] bslib_0.3.1            utf8_1.2.2             R6_2.5.1              
## [13] DBI_1.1.2              mgcv_1.8-38            colorspace_2.0-3      
## [16] withr_2.4.3            tidyselect_1.1.2       prettyunits_1.1.1     
## [19] bit_4.0.4              curl_4.3.2             compiler_4.1.2        
## [22] cli_3.2.0              rvest_1.0.2            Biobase_2.54.0        
## [25] xml2_1.3.3             labeling_0.4.2         sass_0.4.0            
## [28] scales_1.1.1           rappdirs_0.3.3         systemfonts_1.0.4     
## [31] stringr_1.4.0          digest_0.6.29          rmarkdown_2.11        
## [34] svglite_2.1.0          XVector_0.34.0         pkgconfig_2.0.3       
## [37] htmltools_0.5.2        dbplyr_2.1.1           fastmap_1.1.0         
## [40] highr_0.9              rlang_1.0.1            rstudioapi_0.13       
## [43] RSQLite_2.2.10         jquerylib_0.1.4        farver_2.1.0          
## [46] generics_0.1.2         jsonlite_1.7.3         RCurl_1.98-1.6        
## [49] magrittr_2.0.2         GenomeInfoDbData_1.2.7 Matrix_1.4-0          
## [52] Rcpp_1.0.8             munsell_0.5.0          fansi_1.0.2           
## [55] lifecycle_1.0.1        stringi_1.7.6          yaml_2.3.5            
## [58] zlibbioc_1.40.0        BiocFileCache_2.2.1    grid_4.1.2            
## [61] blob_1.2.2             crayon_1.5.0           lattice_0.20-45       
## [64] Biostrings_2.62.0      splines_4.1.2          hms_1.1.1             
## [67] KEGGREST_1.34.0        knitr_1.37             pillar_1.7.0          
## [70] XML_3.99-0.8           glue_1.6.1             evaluate_0.15         
## [73] png_0.1-7              vctrs_0.3.8            gtable_0.3.0          
## [76] purrr_0.3.4            assertthat_0.2.1       cachem_1.0.6          
## [79] xfun_0.29              viridisLite_0.4.0      tibble_3.1.6          
## [82] AnnotationDbi_1.56.2   memoise_2.0.1          ellipsis_0.3.2